library(tidyverse)
library(lubridate)
library(tidyquant)
library(reticulate)
library(plotly)
# Import dataset--------------------------------
data <- read_csv("data/ICD10_F43.2_patients.csv")
data_tib <- as_tibble(data)

# Data wrangling-------------------------------

# Replace sex variables with more descriptive names
data_tib$Sex <- as.character(data_tib$Sex)
data_tib$Sex[data_tib$Sex == "f"] <- "Female"
data_tib$Sex[data_tib$Sex == "m"] <- "Male"
data_tib$Sex <- as.factor(data_tib$Sex)

data_tib <- data_tib %>% 
            mutate(Birth_year = year(`date-of-birth`)) %>%
            glimpse()
## Observations: 1,212
## Variables: 6
## $ id               <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,...
## $ Sex              <fct> Female, Male, Female, Female, Male, Female, Female...
## $ `date-of-birth`  <date> 1989-11-02, 1972-01-29, 1968-10-05, 1982-08-24, 1...
## $ `total-sessions` <dbl> 11, 18, 20, 10, 31, 14, 13, 10, 34, 12, 30, 13, 15...
## $ `therapy-year`   <dbl> 2020, 2019, 2020, 2020, 2008, 2020, 2020, 2020, 20...
## $ Birth_year       <dbl> 1989, 1972, 1968, 1982, 1971, 2002, 1985, 2001, 19...
data_tib <- data_tib %>% 
    mutate(age = `therapy-year` - Birth_year) %>%
    glimpse()
## Observations: 1,212
## Variables: 7
## $ id               <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,...
## $ Sex              <fct> Female, Male, Female, Female, Male, Female, Female...
## $ `date-of-birth`  <date> 1989-11-02, 1972-01-29, 1968-10-05, 1982-08-24, 1...
## $ `total-sessions` <dbl> 11, 18, 20, 10, 31, 14, 13, 10, 34, 12, 30, 13, 15...
## $ `therapy-year`   <dbl> 2020, 2019, 2020, 2020, 2008, 2020, 2020, 2020, 20...
## $ Birth_year       <dbl> 1989, 1972, 1968, 1982, 1971, 2002, 1985, 2001, 19...
## $ age              <dbl> 31, 47, 52, 38, 37, 18, 35, 19, 23, 23, 27, 55, 29...
# How many males in dataset?
males_tib <- data_tib %>% 
    filter(data_tib$Sex == "m") %>% 
    glimpse()
## Observations: 0
## Variables: 7
## $ id               <dbl> 
## $ Sex              <fct> 
## $ `date-of-birth`  <date> 
## $ `total-sessions` <dbl> 
## $ `therapy-year`   <dbl> 
## $ Birth_year       <dbl> 
## $ age              <dbl>
# How many females in dataset?
females_tib <- data_tib %>% 
    filter(data_tib$Sex == "f") %>% 
    glimpse()
## Observations: 0
## Variables: 7
## $ id               <dbl> 
## $ Sex              <fct> 
## $ `date-of-birth`  <date> 
## $ `total-sessions` <dbl> 
## $ `therapy-year`   <dbl> 
## $ Birth_year       <dbl> 
## $ age              <dbl>
# Visualise the dataset
fig <- ggplot(data = data_tib, aes(x = age, y = `total-sessions`, color = Sex)) + 
    geom_point(alpha = 0.5) + 
    labs(
        title = "Visualisation of data set",
        subtitle = "Therapy sessions according to age and sex",
        x_lab= "Age",
        y_lab="Sessions")+
    theme_tq()

# Add interactivity
ggplotly(fig)
# Remove outliers
# Remove all ages less than 13 and more than 70
# Remove all data where 4 < total-sessions <= 20

data_tib <- data_tib %>% 
            filter(age > 12, age < 71, `total-sessions`<= 20,`total-sessions`>4) %>% 
            glimpse()
## Observations: 1,108
## Variables: 7
## $ id               <dbl> 1, 2, 3, 4, 6, 7, 8, 10, 12, 13, 14, 16, 17, 18, 1...
## $ Sex              <fct> Female, Male, Female, Female, Female, Female, Fema...
## $ `date-of-birth`  <date> 1989-11-02, 1972-01-29, 1968-10-05, 1982-08-24, 2...
## $ `total-sessions` <dbl> 11, 18, 20, 10, 14, 13, 10, 12, 13, 15, 12, 14, 20...
## $ `therapy-year`   <dbl> 2020, 2019, 2020, 2020, 2020, 2020, 2020, 2019, 20...
## $ Birth_year       <dbl> 1989, 1972, 1968, 1982, 2002, 1985, 2001, 1996, 19...
## $ age              <dbl> 31, 47, 52, 38, 18, 35, 19, 23, 55, 29, 58, 57, 34...
# check if outliers were removed
summary(data_tib)
##        id             Sex      date-of-birth        total-sessions 
##  Min.   :   1.0   Female:744   Min.   :1941-02-09   Min.   : 5.00  
##  1st Qu.: 303.8   Male  :364   1st Qu.:1966-01-20   1st Qu.: 8.00  
##  Median : 604.5                Median :1976-02-18   Median :12.00  
##  Mean   : 606.8                Mean   :1976-03-26   Mean   :11.64  
##  3rd Qu.: 912.2                3rd Qu.:1986-12-31   3rd Qu.:14.00  
##  Max.   :1212.0                Max.   :2004-10-21   Max.   :20.00  
##   therapy-year    Birth_year        age       
##  Min.   :1997   Min.   :1941   Min.   :13.00  
##  1st Qu.:2010   1st Qu.:1966   1st Qu.:27.00  
##  Median :2014   Median :1976   Median :36.00  
##  Mean   :2013   Mean   :1976   Mean   :37.15  
##  3rd Qu.:2017   3rd Qu.:1986   3rd Qu.:47.00  
##  Max.   :2020   Max.   :2004   Max.   :68.00
fig_no_outliers <- ggplot(data = data_tib, aes(x = age, y = `total-sessions`, color = Sex)) + 
    geom_point(alpha = 0.5) + 
    labs(
        title = "Visualisation of data set",
        subtitle = "Therapy sessions according to age and sex",
        x_lab= "Age",
        y_lab="Sessions")+
    theme_tq()

# Visualisation with outliers removed
fig_no_outliers

ggplotly(fig_no_outliers)
# Split up data into females and males
males_tib <- data_tib %>% 
    filter(Sex == "Male") %>% 
    glimpse()
## Observations: 364
## Variables: 7
## $ id               <dbl> 2, 12, 13, 22, 26, 28, 29, 31, 33, 34, 42, 43, 46,...
## $ Sex              <fct> Male, Male, Male, Male, Male, Male, Male, Male, Ma...
## $ `date-of-birth`  <date> 1972-01-29, 1965-05-31, 1990-06-24, 1968-06-07, 2...
## $ `total-sessions` <dbl> 18, 13, 15, 13, 12, 14, 8, 15, 13, 17, 6, 16, 10, ...
## $ `therapy-year`   <dbl> 2019, 2020, 2019, 2020, 2019, 2017, 2019, 2019, 20...
## $ Birth_year       <dbl> 1972, 1965, 1990, 1968, 2004, 1970, 1988, 1974, 19...
## $ age              <dbl> 47, 55, 29, 52, 15, 47, 31, 45, 51, 46, 32, 40, 35...
females_tib <- data_tib %>% 
    filter(Sex =="Female") %>% 
    glimpse()
## Observations: 744
## Variables: 7
## $ id               <dbl> 1, 3, 4, 6, 7, 8, 10, 14, 16, 17, 18, 19, 20, 21, ...
## $ Sex              <fct> Female, Female, Female, Female, Female, Female, Fe...
## $ `date-of-birth`  <date> 1989-11-02, 1968-10-05, 1982-08-24, 2002-08-05, 1...
## $ `total-sessions` <dbl> 11, 20, 10, 14, 13, 10, 12, 12, 14, 20, 13, 13, 8,...
## $ `therapy-year`   <dbl> 2020, 2020, 2020, 2020, 2020, 2020, 2019, 2020, 20...
## $ Birth_year       <dbl> 1989, 1968, 1982, 2002, 1985, 2001, 1996, 1962, 19...
## $ age              <dbl> 31, 52, 38, 18, 35, 19, 23, 58, 57, 34, 16, 52, 58...
# Split data up into data and labels (prepare to apply machine learning model)
female_dataset <- females_tib %>% 
                select(age) %>% 
                glimpse()
## Observations: 744
## Variables: 1
## $ age <dbl> 31, 52, 38, 18, 35, 19, 23, 58, 57, 34, 16, 52, 58, 46, 21, 32,...
female_labels <- females_tib %>% 
                select(`total-sessions`) %>% 
                glimpse()
## Observations: 744
## Variables: 1
## $ `total-sessions` <dbl> 11, 20, 10, 14, 13, 10, 12, 12, 14, 20, 13, 13, 8,...
males_dataset <- males_tib %>% 
                select(age) %>% 
                glimpse()
## Observations: 364
## Variables: 1
## $ age <dbl> 47, 55, 29, 52, 15, 47, 31, 45, 51, 46, 32, 40, 35, 30, 26, 20,...
males_labels <- males_tib %>% 
                select(`total-sessions`) %>% 
                glimpse()
## Observations: 364
## Variables: 1
## $ `total-sessions` <dbl> 18, 13, 15, 13, 12, 14, 8, 15, 13, 17, 6, 16, 10, ...
library(tidyverse)
library(reticulate)
# set conda environment
use_condaenv("py3.8", required = TRUE)

# check if the correct environment is being used
py_config()
## python:         C:/Users/Susanna/Anaconda3/envs/py3.8/python.exe
## libpython:      C:/Users/Susanna/Anaconda3/envs/py3.8/python38.dll
## pythonhome:     C:/Users/Susanna/Anaconda3/envs/py3.8
## version:        3.8.3 (default, May 19 2020, 06:50:17) [MSC v.1916 64 bit (AMD64)]
## Architecture:   64bit
## numpy:          C:/Users/Susanna/Anaconda3/envs/py3.8/Lib/site-packages/numpy
## numpy_version:  1.18.1
## 
## NOTE: Python version was forced by use_python function
# is python working?
1+1
## 2
# XGBoost algorithm--------------------------------------------
# import python libraries here
# in conda terminal (with py3.8 env activated, install xgboost): pip install xgboost
import xgboost as xgb
import sklearn
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
# write tibble files to .csv files
female_dataset_py <- write_csv(female_dataset, 
                               path = "female_dataset_py.csv")
female_labels_py <- write_csv(female_labels,
                              path = "female_labels_py.csv")
males_dataset_py <- write_csv(males_dataset,
                              path = "males_dataset_py.csv")
males_labels_py <- write_csv(males_labels,
                             path = "males_labels_py.csv")
# Import the created .csv files
# Females
data_female = pd.read_csv(r'C:/Users/Sanmari Vivier/Dropbox/Personal/r-shiny-app/r-markdown-files/female_dataset_py.csv')
data_female_labels = pd.read_csv(r'C:/Users/Sanmari Vivier/Dropbox/Personal/r-shiny-app/r-markdown-files/female_labels_py.csv')
female_dataset_py = pd.DataFrame(data_female)
female_labels_py = pd.DataFrame(data_female_labels)

# Check to see if it is working
print (female_dataset_py.head(), female_labels_py.head())
##    age
## 0   31
## 1   52
## 2   38
## 3   18
## 4   35    total-sessions
## 0              11
## 1              20
## 2              10
## 3              14
## 4              13
# Import the created .csv files----------------------------------
# Males
data_male = pd.read_csv(r'C:/Users/Sanmari Vivier/Dropbox/Personal/r-shiny-app/r-markdown-files/males_dataset_py.csv')
data_male_labels = pd.read_csv(r'C:/Users/Sanmari Vivier/Dropbox/Personal/r-shiny-app/r-markdown-files/males_labels_py.csv')
male_dataset_py = pd.DataFrame(data_male)
male_labels_py = pd.DataFrame(data_male_labels)

# Check to see if it is working
print (male_dataset_py.head(), male_labels_py.head())
##    age
## 0   47
## 1   55
## 2   29
## 3   52
## 4   15    total-sessions
## 0              18
## 1              13
## 2              15
## 3              13
## 4              12
# Split up the data into training and testing sets
# Females

females_data_train, females_data_test = train_test_split(female_dataset_py,
                 test_size=0.33, 
                 random_state=42)
females_labels_train, females_labels_test = train_test_split(female_labels_py,
                 test_size=0.33, 
                 random_state=42)
# Split up the data into training and testing sets
# Males

males_data_train, males_data_test = train_test_split(male_dataset_py,
                 test_size=0.33, 
                 random_state=42)
males_labels_train, males_labels_test = train_test_split(male_labels_py,
                 test_size=0.33, 
                 random_state=42)

# Check if the data is split                 
print(males_labels_train)
##      total-sessions
## 118               9
## 31               13
## 36               13
## 284               6
## 181              12
## ..              ...
## 71                9
## 106               6
## 270              12
## 348               7
## 102               6
## 
## [243 rows x 1 columns]

Model 1: XGBoost Classifier algorithm————————

from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Change the data shape of the labels
females_labels_train = np.ravel(females_labels_train)
females_labels_test = np.ravel(females_labels_test)
males_labels_train = np.ravel(males_labels_train)
males_labels_test = np.ravel(males_labels_test)

# Train the model
model = XGBClassifier(max_depth=6,
                    eta=0.01,
                    gamma=4,
                    min_child_weight=6,
                    subsample=0.8,
                    #silent=0,
                    objective='binary:logistic',
                    n_estimators=5,
                    seed=1729
                    )
                                        
model_xgb_f = model
model_xgb_m = model

model_xgb_f.fit(females_data_train, females_labels_train)
## XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
##               colsample_bynode=1, colsample_bytree=1, eta=0.01, gamma=4,
##               gpu_id=-1, importance_type='gain', interaction_constraints='',
##               learning_rate=0.00999999978, max_delta_step=0, max_depth=6,
##               min_child_weight=6, missing=nan, monotone_constraints='()',
##               n_estimators=5, n_jobs=0, num_parallel_tree=1,
##               objective='multi:softprob', random_state=1729, reg_alpha=0,
##               reg_lambda=1, scale_pos_weight=None, seed=1729, subsample=0.8,
##               tree_method='exact', validate_parameters=1, verbosity=None)
model_xgb_m.fit(males_data_train, males_labels_train)

# Test model
## XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
##               colsample_bynode=1, colsample_bytree=1, eta=0.01, gamma=4,
##               gpu_id=-1, importance_type='gain', interaction_constraints='',
##               learning_rate=0.00999999978, max_delta_step=0, max_depth=6,
##               min_child_weight=6, missing=nan, monotone_constraints='()',
##               n_estimators=5, n_jobs=0, num_parallel_tree=1,
##               objective='multi:softprob', random_state=1729, reg_alpha=0,
##               reg_lambda=1, scale_pos_weight=None, seed=1729, subsample=0.8,
##               tree_method='exact', validate_parameters=1, verbosity=None)
pred_xgb_f = model_xgb_f.predict(females_data_test)
pred_xgb_m = model_xgb_m.predict(males_data_test)
xgb_acc_f = accuracy_score(females_labels_test, pred_xgb_f)
xgb_acc_m = accuracy_score(males_labels_test, pred_xgb_m)
print("\n")
print("XGB model accuracy for females: {:.3f} %\n".format(xgb_acc_f * 100))
## XGB model accuracy for females: 10.163 %
print("XGB model accuracy for males: {:.3f} %\n".format(xgb_acc_m * 100))
## XGB model accuracy for males: 11.570 %

Model 2: XGBoost Regressor algorithm————————

# Model 2: XGB Regressor
# Here we will be calculating the loss using RMSE (root-mean-square-error)
from sklearn.metrics import mean_squared_error

model2 = xgb.XGBRegressor(objective='reg:squarederror',                                  n_estimators=12,
                          seed=123,
                          learning_rate=0.45
                          )

# Train the model
model_xgbR_f = model2
model_xgbR_m = model2

model_xgbR_f.fit(females_data_train, females_labels_train, eval_metric='rmse')
## XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
##              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
##              importance_type='gain', interaction_constraints='',
##              learning_rate=0.45, max_delta_step=0, max_depth=6,
##              min_child_weight=1, missing=nan, monotone_constraints='()',
##              n_estimators=12, n_jobs=0, num_parallel_tree=1,
##              objective='reg:squarederror', random_state=123, reg_alpha=0,
##              reg_lambda=1, scale_pos_weight=1, seed=123, subsample=1,
##              tree_method='exact', validate_parameters=1, verbosity=None)
model_xgbR_m.fit(males_data_train, males_labels_train, eval_metric='rmse')

# Test model
## XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
##              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
##              importance_type='gain', interaction_constraints='',
##              learning_rate=0.45, max_delta_step=0, max_depth=6,
##              min_child_weight=1, missing=nan, monotone_constraints='()',
##              n_estimators=12, n_jobs=0, num_parallel_tree=1,
##              objective='reg:squarederror', random_state=123, reg_alpha=0,
##              reg_lambda=1, scale_pos_weight=1, seed=123, subsample=1,
##              tree_method='exact', validate_parameters=1, verbosity=None)
pred_xgbR_f = model_xgbR_f.predict(females_data_test)
pred_xgbR_m = model_xgbR_m.predict(males_data_test)
rmse_f = np.sqrt(mean_squared_error(females_labels_test, pred_xgbR_f))
rmse_m = np.sqrt(mean_squared_error(males_labels_test, pred_xgbR_m))
print("\n")
print("RMSE for females: {:.3f} \n".format(rmse_f))
## RMSE for females: 4.222
print("RMSE for males: {:.3f} \n".format(rmse_m))
## RMSE for males: 3.740

Model 3: Naive Bayes algorithm————————

from sklearn.naive_bayes import GaussianNB

# Change the data shapes for the labels using np.ravel()
# Train set
females_labels_train_nb = np.ravel(females_labels_train)
males_labels_train_nb = np.ravel(males_labels_train)

# Test set
females_labels_test_nb = np.ravel(females_labels_test)
males_labels_test_nb = np.ravel(males_labels_test)

model3 = GaussianNB(var_smoothing=0.000000001)

# Train the model
model_nb_f = model3
model_nb_m = model3

model_nb_f.fit(females_data_train, females_labels_train_nb)
## GaussianNB(priors=None, var_smoothing=1e-09)
model_nb_m.fit(males_data_train, males_labels_train_nb)

# Test model
## GaussianNB(priors=None, var_smoothing=1e-09)
pred_nb_f = model_nb_f.predict(females_data_test)
pred_nb_m = model_nb_m.predict(males_data_test)
nb_acc_f = accuracy_score(females_labels_test_nb, pred_nb_f)
nb_acc_m = accuracy_score(males_labels_test_nb, pred_nb_m)
print("\n")
print("NB model accuracy for females: {:.3f} %\n".format(nb_acc_f * 100))
## NB model accuracy for females: 11.382 %
print("NB model accuracy for males: {:.3f} %\n".format(nb_acc_m * 100))
## NB model accuracy for males: 13.223 %

Model 4: Support Vector Machine algorithm——————

#from sklearn.svm import SVC
from sklearn.svm import LinearSVC

# Change the data shapes for the labels using np.ravel()
# Train set
females_labels_train_svm = np.ravel(females_labels_train)
males_labels_train_svm = np.ravel(males_labels_train)

# Test set
females_labels_test_svm = np.ravel(females_labels_test)
males_labels_test_svm = np.ravel(males_labels_test)

model4 = LinearSVC(C = 1000, 
                    max_iter=1000000,
                    dual=False
                    )

# Train the model
model_svm_f = model4
model_svm_m = model4

model_svm_f.fit(females_data_train, females_labels_train_svm)
## LinearSVC(C=1000, class_weight=None, dual=False, fit_intercept=True,
##           intercept_scaling=1, loss='squared_hinge', max_iter=1000000,
##           multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
##           verbose=0)
model_svm_m.fit(males_data_train, males_labels_train_svm)

# Test model
## LinearSVC(C=1000, class_weight=None, dual=False, fit_intercept=True,
##           intercept_scaling=1, loss='squared_hinge', max_iter=1000000,
##           multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
##           verbose=0)
pred_svm_f = model_svm_f.predict(females_data_test)
pred_svm_m = model_svm_m.predict(males_data_test)
svm_acc_f = accuracy_score(females_labels_test_svm, pred_svm_f)
svm_acc_m = accuracy_score(males_labels_test_svm, pred_svm_m)
print("\n")
print("SVM model accuracy for females: {:.3f} %\n".format(svm_acc_f * 100))
## SVM model accuracy for females: 10.163 %
print("SVM model accuracy for males: {:.3f} %\n".format(svm_acc_m * 100))
## SVM model accuracy for males: 12.397 %

Model 5: Logistic Regression algorithm——————

from sklearn.linear_model import LogisticRegression

model5 = LogisticRegression(max_iter=100000,
                            C=1,
                            solver='saga',
                            dual=False
                            )

# Train the model
model_lr_f = model5
model_lr_m = model5

model_lr_f.fit(females_data_train, females_labels_train_nb)
## LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
##                    intercept_scaling=1, l1_ratio=None, max_iter=100000,
##                    multi_class='auto', n_jobs=None, penalty='l2',
##                    random_state=None, solver='saga', tol=0.0001, verbose=0,
##                    warm_start=False)
model_lr_m.fit(males_data_train, males_labels_train_nb)

# Test model
## LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
##                    intercept_scaling=1, l1_ratio=None, max_iter=100000,
##                    multi_class='auto', n_jobs=None, penalty='l2',
##                    random_state=None, solver='saga', tol=0.0001, verbose=0,
##                    warm_start=False)
pred_lr_f = model_lr_f.predict(females_data_test)
pred_lr_m = model_lr_m.predict(males_data_test)
lr_acc_f = accuracy_score(females_labels_test_nb, pred_lr_f)
lr_acc_m = accuracy_score(males_labels_test_nb, pred_lr_m)
print("\n")
print("LR model accuracy for females: {:.3f} %\n".format(lr_acc_f * 100))
## LR model accuracy for females: 10.976 %
print("LR model accuracy for males: {:.3f} %\n".format(lr_acc_m * 100))
## LR model accuracy for males: 14.050 %